nipah_RBP_entry_analysis.ipynb¶

This notebook pulls in data on Nipah RBP entry into CHO-bEFNB2 and bCHO-EFNB3 cells, calculates stats, and makes figures¶

  • Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
nipah_config = None
altair_config = None
surface = None

#input files
e2_distances_file = None
func_scores_E2_file = None
func_scores_E3_file = None
concat_df_file = None
merged_df_file = None

#output files
contact_type_plot = None
E2_E3_entry_corr_plot = None
E2_E3_entry_all_muts_plot = None
combined_E2_E3_correlation_plots = None
entry_region_boxplot_plot = None
combined_region_barplot_output = None
In [2]:
# Parameters
func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
e2_distances_file = "results/distances/2vsm_distances.csv"
contact_type_plot = "results/images/contact_type_plot.html"
merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
E2_E3_entry_corr_plot = "results/images/E2_E3_entry_corr_plot.html"
E2_E3_entry_all_muts_plot = "results/images/E2_E3_entry_all_muts_plot.html"
combined_E2_E3_correlation_plots = (
    "results/images/combined_E2_E3_correlation_plots.html"
)
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entry_region_boxplot_plot = "results/images/entry_region_boxplot_plot.html"
surface = "data/custom_analyses_data/surface_exposure.csv"
combined_region_barplot_output = "results/images/combined_region_barplot_output.html"
In [3]:
import math
import os
import re
import altair as alt

import numpy as np

import pandas as pd

import scipy.stats

#import Bio.SeqIO

import yaml

#from Bio import AlignIO
#from Bio import PDB
#from Bio.Align import PairwiseAligner
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")
Setup in correct directory

Paths for running notebook interactively¶

In [5]:
if nipah_config is None:
    e2_distances_file = "results/distances/2vsm_distances.csv"
    func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
    func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
    
    concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
    merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
    
    nipah_config = "nipah_config.yaml"
    altair_config = "data/custom_analyses_data/theme.py"
    
    surface = "data/custom_analyses_data/surface_exposure.csv"

Read in custom altair theme and import YAML file with parameters¶

In [6]:
if altair_config:
    with open(altair_config, 'r') as file:
        exec(file.read())

with open(nipah_config) as f:
    config = yaml.safe_load(f)

Import filtered data¶

In [7]:
#Import filtered entry scores from different selections
func_scores_E2 = pd.read_csv(func_scores_E2_file)
func_scores_E3 = pd.read_csv(func_scores_E3_file)
concat_df = pd.read_csv(concat_df_file) #concatanated entry scores
merged_df = pd.read_csv(merged_df_file) #merged entry scores

Calculate some basic stats about E2 and E3 entry scores¶

In [8]:
def calculate_stats(df, name):
    print(f"For {name}:")
    total_mut = (532) * 19 #total number of possible amino acids
    print(f'There are {total_mut} amino acid mutations possible')
    muts_present = df["effect"].shape[0]
    fraction_muts = muts_present / total_mut
    print(
        f"fraction muts present in {name} is {fraction_muts:.2f} {muts_present}/{total_mut}"
    )

    # Break mutations into bins (negative, neutral, positive) and calculate how many mutants there are
    deleterious_muts = df[df["effect"] <= -0.5].shape[0] 
    neutral_muts = df[(df["effect"] > -0.5) & (df["effect"] < 0.5)].shape[0]
    positive_muts = df[df["effect"] > 0.5].shape[0]

    frac_bad_muts = deleterious_muts / muts_present
    frac_neutral_muts = neutral_muts / muts_present
    frac_pos_muts = positive_muts / muts_present
    print(
        f"The number of deleterious mutants for {name} is {frac_bad_muts:.2f} {deleterious_muts}/{muts_present}"
    )
    print(
        f"The number of neutral mutants for {name} is {frac_neutral_muts:.2f} {neutral_muts}/{muts_present}"
    )
    print(
        f"The number of positive mutants for {name} is {frac_pos_muts:.2f} {positive_muts}/{muts_present}\n"
    )

calculate_stats(func_scores_E2, "CHO-bEFNB2")
calculate_stats(func_scores_E3, "CHO-bEFNB3")
For CHO-bEFNB2:
There are 10108 amino acid mutations possible
fraction muts present in CHO-bEFNB2 is 0.97 9786/10108
The number of deleterious mutants for CHO-bEFNB2 is 0.44 4296/9786
The number of neutral mutants for CHO-bEFNB2 is 0.54 5303/9786
The number of positive mutants for CHO-bEFNB2 is 0.02 186/9786

For CHO-bEFNB3:
There are 10108 amino acid mutations possible
fraction muts present in CHO-bEFNB3 is 0.96 9713/10108
The number of deleterious mutants for CHO-bEFNB3 is 0.43 4182/9713
The number of neutral mutants for CHO-bEFNB3 is 0.56 5451/9713
The number of positive mutants for CHO-bEFNB3 is 0.01 80/9713

Find which sites only have negative entry scores for all mutants¶

In [9]:
def overall_stats_all_neg(df,effect,name,region=None):
    print(f'We are analyzing data for {name}:')
    
    if region is None:
        contact_df = df
    else:
        contact_df = df[df['site'].isin(region)]
    
    filtered_df = contact_df.groupby('site').filter(lambda group: (group['effect'] < 0).all())
    unique = filtered_df['site'].unique()
    print(list(unique))
    total_sites = contact_df['site'].unique().shape[0]
    subset = filtered_df['site'].unique().shape[0]
       
    fraction = subset/total_sites
    percent = fraction * 100
    print(f'The total number of sites are: {total_sites}\n')
    print(f' The number of sites where all mutants are negative for {effect}: {subset}\n')
    print(f' The percent of sites where all mutants are negative for {effect}: {percent:.0f}\n')
    return unique

intolerant_sites_E2 = list(overall_stats_all_neg(func_scores_E2,'effect','CHO-bEFNB2'))
intolerant_sites_E3 = list(overall_stats_all_neg(func_scores_E3,'effect','CHO-bEFNB3'))
We are analyzing data for CHO-bEFNB2:
[79, 80, 95, 97, 106, 107, 111, 112, 113, 120, 121, 125, 126, 127, 128, 130, 138, 146, 151, 157, 158, 159, 160, 161, 162, 163, 165, 167, 172, 189, 203, 205, 206, 207, 208, 216, 229, 233, 240, 246, 251, 253, 254, 257, 258, 259, 260, 262, 263, 264, 266, 267, 298, 303, 320, 323, 331, 347, 355, 382, 387, 395, 412, 439, 454, 460, 467, 487, 489, 493, 499, 500, 503, 506, 510, 537, 563, 565, 573, 574, 575, 594, 598]
The total number of sites are: 532

 The number of sites where all mutants are negative for effect: 83

 The percent of sites where all mutants are negative for effect: 16

We are analyzing data for CHO-bEFNB3:
[100, 104, 107, 108, 109, 111, 112, 113, 120, 121, 126, 138, 146, 158, 159, 162, 163, 206, 207, 208, 216, 220, 229, 236, 240, 243, 251, 253, 257, 258, 259, 260, 263, 266, 276, 303, 347, 352, 355, 368, 382, 387, 389, 395, 412, 438, 439, 458, 460, 467, 486, 487, 488, 489, 493, 494, 499, 500, 501, 503, 504, 505, 506, 510, 526, 531, 532, 533, 537, 556, 557, 563, 565, 573, 574, 579, 581, 584, 588, 594]
The total number of sites are: 532

 The number of sites where all mutants are negative for effect: 80

 The percent of sites where all mutants are negative for effect: 15

Make bubble plots of receptor contact site type (median values per site)¶

In [10]:
def make_bubbleplot_entry_region(df):  # Create a bubble plot using Altair for contact site mutants
    barrel_ranges = {
        "Hydrophobic": config["hydrophobic"],
        "Salt Bridges": config["salt_bridges"],
        "Hydrogen Bonds": config["h_bond_total"],
        "Contact": config["contact_sites"],
    }
    custom_order = [
        "Hydrophobic",
        "Salt Bridges",
        "Hydrogen Bonds",
        "Contact",
    ]
    empty_chart = []
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, fields=["site"], value=1
    )
    
    for selection in ["CHO-bEFNB2", "CHO-bEFNB3"]:
        agg_means = []
        tmp_df = df[df["cell_type"] == selection]
        mean_df = tmp_df.groupby("site")[["effect"]].mean().reset_index()

        # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
        for barrel, sites in barrel_ranges.items():
            subset = mean_df[mean_df["site"].isin(sites)]
            for _, row in subset.iterrows():
                agg_means.append(
                    {"barrel": barrel, "effect": row["effect"], "site": row["site"]}
                )
        agg_means_df = pd.DataFrame(agg_means)
        chart = (
            alt.Chart(agg_means_df, title=f"{selection}")
            .mark_point(size=200,filled=True)
            .encode(
                x=alt.X(
                    "barrel:O",
                    sort=custom_order,
                    title='Contact Type',
                    axis=alt.Axis(labelAngle=-90),
                ),
                y=alt.Y(
                    "effect",
                    title="Cell Entry",
                    axis=alt.Axis(grid=True, tickCount=4),
                ),
                xOffset="random:Q",
                tooltip=["barrel", "effect", "site"],
                size=alt.condition(
                    variant_selector, alt.value(100), alt.value(50)
                ),
                color=alt.condition(
                    variant_selector, alt.value("orange"), alt.value("black")
                ),
                opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.3)),
            ).properties(height=200,width=200)
            .transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
        )
        empty_chart.append(chart)
    combined_effect_chart = (
        alt.hconcat(*empty_chart)
        .resolve_scale(y="shared", x="shared", color="independent")
        .add_params(variant_selector)
    )
    return combined_effect_chart


tmp_img = make_bubbleplot_entry_region(concat_df)
tmp_img.display()
if entry_region_boxplot_plot is not None:
    tmp_img.save(contact_type_plot)

Make bubble plots depending on amino acid property¶

In [11]:
def make_bubbleplot_wildtype_prop(df):
    barrel_ranges = {
        "hydrophobic": list(["A", "V", "L", "I", "M"]),
        "aromatic": list(["Y", "W", "F"]),
        "positive": list(["K", "R", "H"]),
        "negative": list(["E", "D"]),
        "hydrophilic": list(["S", "T", "N", "Q"]),
        "special": list(["C", "P", "G"]),
    }
    empty_charts = []
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site"], value=1
    )
    for selection in ["CHO-bEFNB2", "CHO-bEFNB3"]:
        if selection == "CHO-bEFNB2":
            effect_name = "EFNB2"
        else:
            effect_name = "EFNB3"

        tmp_df = df[df["cell_type"] == selection]

        unique_wildtype_df = tmp_df[["site", "wildtype"]].drop_duplicates()
        mean_df = tmp_df.groupby("site")[["effect"]].mean().reset_index()
        mean_df = pd.merge(mean_df, unique_wildtype_df, on="site", how="left")

        agg_means = []

        # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
        for barrel, sites in barrel_ranges.items():
            subset = mean_df[mean_df["wildtype"].isin(sites)]
            for _, row in subset.iterrows():
                agg_means.append(
                    {"wildtype_class": barrel, "effect": row["effect"], "site": row["site"], "wildtype": row["wildtype"]}
                )
        agg_means_df = pd.DataFrame(agg_means)

        chart = (
            alt.Chart(agg_means_df, title=f"{selection}")
            .mark_point(size=100,filled=True)
            .encode(
                x=alt.X(
                    "wildtype_class:O",
                    title="Wildtype amino-acid property",
                    axis=alt.Axis(labelAngle=-90),
                ),  
                y=alt.Y(
                    "effect",
                    title=f"Cell Entry",
                    axis=alt.Axis(grid=True, tickCount=4),
                ),
                xOffset="random:Q",
                tooltip=["wildtype_class", "effect", "site","wildtype"],
                size=alt.condition(variant_selector, alt.value(120), alt.value(50)),
                color=alt.condition(
                    variant_selector, alt.value("orange"), alt.value("black")
                ),
                opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.2)),
            ).properties(width=200,height=200)
            .transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
            
        )
        empty_charts.append(chart)
    combined_effect_chart = (
        alt.hconcat(*empty_charts)
        .resolve_scale(y="shared", x="shared", color="independent")
        .add_params(variant_selector)
    )
    return combined_effect_chart


wildtype_aa_bubble_img = make_bubbleplot_wildtype_prop(concat_df)
wildtype_aa_bubble_img.display()

Plot correlations between E2 and E3 entry¶

In [12]:
# Import distance data
e2_distances = pd.read_csv(e2_distances_file)
distance_df = pd.merge(
    merged_df, e2_distances[["site", "distance"]], on="site", how="left"
)

def determine_distance(df):
    # Define the conditions
    conditions = [
        df["distance"] < 4,
        (df["distance"] >= 4) & (df["distance"] <= 8),
        df["distance"] > 8,
    ]

    # Define the associated values for the conditions
    choices = ["contact", "close", "distant"]

    # Apply the conditions and choices to the 'E2_contact' column
    df["contact"] = np.select(conditions, choices, default="distant")
    return df


distance_df = determine_distance(distance_df)
display(distance_df)
site wildtype mutant effect_E2 effect_std_E2 times_seen_E2 n_selections_E2 cell_type_E2 wildtype_site_E2 wt_type_E2 ... effect_E3 effect_std_E3 times_seen_E3 n_selections_E3 cell_type_E3 wildtype_site_E3 wt_type_E3 mutant_type_E3 distance contact
0 71 Q C -1.750 0.1777 4.625 8.0 CHO-bEFNB2 Q71 hydrophilic ... -0.72270 0.7828 3.000 7.0 CHO-bEFNB3 Q71 hydrophilic special NaN distant
1 71 Q D -1.164 0.8890 4.500 8.0 CHO-bEFNB2 Q71 hydrophilic ... -0.38840 0.6369 3.429 7.0 CHO-bEFNB3 Q71 hydrophilic negative NaN distant
2 71 Q E -1.255 0.3123 5.375 8.0 CHO-bEFNB2 Q71 hydrophilic ... -0.24820 0.9791 4.571 7.0 CHO-bEFNB3 Q71 hydrophilic negative NaN distant
3 71 Q F -1.058 0.6637 4.625 8.0 CHO-bEFNB2 Q71 hydrophilic ... -0.49730 0.3080 3.286 7.0 CHO-bEFNB3 Q71 hydrophilic aromatic NaN distant
4 71 Q G -1.425 0.5878 7.875 8.0 CHO-bEFNB2 Q71 hydrophilic ... -1.33100 0.8316 4.714 7.0 CHO-bEFNB3 Q71 hydrophilic special NaN distant
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9934 595 V T NaN NaN NaN NaN NaN NaN NaN ... -0.32370 0.5694 2.143 7.0 CHO-bEFNB3 V595 hydrophobic hydrophilic 17.838812 distant
9935 599 E A NaN NaN NaN NaN NaN NaN NaN ... 0.17320 0.4067 3.143 7.0 CHO-bEFNB3 E599 negative hydrophobic 30.234262 distant
9936 600 Q V NaN NaN NaN NaN NaN NaN NaN ... 0.31020 0.1140 3.571 7.0 CHO-bEFNB3 Q600 hydrophilic hydrophobic 30.670734 distant
9937 601 C I NaN NaN NaN NaN NaN NaN NaN ... -0.75770 0.9218 2.286 7.0 CHO-bEFNB3 C601 special hydrophobic 29.834501 distant
9938 601 C V NaN NaN NaN NaN NaN NaN NaN ... 0.01403 0.7747 3.000 4.0 CHO-bEFNB3 C601 special hydrophobic 29.834501 distant

9939 rows × 21 columns

In [13]:
def median_correlation_plot(df, metric):
    aggregation = getattr(df.groupby('site')[["effect_E2", "effect_E3"]], metric)
    means = aggregation().reset_index()
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        means["effect_E2"], means["effect_E3"]
    )
    display(r_value.round(2))

    means = means.rename(
        columns={"effect_E2": f"{metric}_E2", "effect_E3": f"{metric}_E3"}
    )

    contact_sites = df[["site", "contact", "wildtype"]].drop_duplicates()
    df_mean = pd.merge(means, contact_sites, on="site", how="left")

    chart = (
        alt.Chart(df_mean)
        .mark_point(filled=True,size=60,stroke='black',strokeWidth=1)
        .encode(
            x=alt.X(f"{metric}_E2", title="Entry in CHO-bEFNB2"),
            y=alt.Y(f"{metric}_E3", title="Entry in CHO-bEFNB3"),
            tooltip=["site", "wildtype"],
            color=alt.Color(
                "contact",
                scale=alt.Scale(
                    domain=["contact", "close", "distant"],
                    range=["#1f4e79", "#ff7f0e", "gray"],
                ),
                legend=alt.Legend(title="Receptor distance"),
            ),
        )
    ).properties(width=200,height=200)
    min_ = int(df_mean[f"{metric}_E2"].min())
    max_ = int(df_mean[f"{metric}_E3"].max())
    text = (
        alt.Chart({"values": [{"x": min_, "y": max_, "text": f"r = {r_value:.2f}"}]})
        .mark_text(
            align="left",
            baseline="top",
            dx=-20,  # Adjust this for position
            dy=-20,  # Adjust this for position
        )
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    plot = chart + text
    return plot


E2_E3_plot = median_correlation_plot(distance_df, "mean")
E2_E3_plot.display()
if entry_region_boxplot_plot is not None:
    E2_E3_plot.save(E2_E3_entry_corr_plot)
0.82
In [14]:
def correlation_plot(df):
    df = df.dropna()
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        df["effect_E2"], df["effect_E3"]
    )
    display(r_value.round(2))
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["contact"], value=1
    )
    chart = (
        alt.Chart(df)
        .mark_point(size=40,opacity=1,filled=True)
        .encode(
            x=alt.X("effect_E2", title="Entry in CHO-bEFNB2"),
            y=alt.Y("effect_E3", title="Entry in CHO-bEFNB3"),
            tooltip=["site", "wildtype", "mutant"],
            color=alt.Color(
                "contact",
                scale=alt.Scale(
                    domain=["contact", "close", "distant"],
                    range=["#1f4e79", "#ff7f0e", "gray"],
                ),
                legend=alt.Legend(title="RBP Distance to Receptor"),
            ),
            opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.2)),
        ).add_params(variant_selector)
    )
    min_ = int(df['effect_E2'].min())
    max_ = int(df['effect_E3'].max())
    
    text = (
        alt.Chart({"values": [{"x": min_, "y": max_, "text": f"r = {r_value:.2f}"}]})
        .mark_text(
            align="left",
            baseline="top",
            dx=-20,  # Adjust this for position
            dy=-20,  # Adjust this for position
        )
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    plot = chart + text

    return plot


tmp_img_corr = correlation_plot(distance_df)
tmp_img_corr.display()
if entry_region_boxplot_plot is not None:
    tmp_img_corr.save(E2_E3_entry_all_muts_plot)

if entry_region_boxplot_plot is not None:
    (E2_E3_plot | tmp_img_corr).save(combined_E2_E3_correlation_plots)
0.79

Make boxplot showing median entry by RBP region¶

In [15]:
def make_boxplot_entry_region(df):
    barrel_ranges = {
        "Stalk": list(range(70, 147)),
        "Neck": list(range(148, 165)),
        "Linker": list(range(166, 177)),
        "Head": list(range(178, 602)),
    }
    custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
    empty_charts = []
    for selection in ["CHO-bEFNB2", "CHO-bEFNB3"]:
        if selection == "CHO-EFNB2":
            effect_name = "EFNB2"
        else:
            effect_name = "EFNB3"

        tmp_df = df[df["cell_type"] == selection]
        agg_means = []
        for barrel, sites in barrel_ranges.items():
            subset = tmp_df[tmp_df["site"].isin(sites)]
            for _, row in subset.iterrows():
                agg_means.append(
                    {"region": barrel, "effect": row["effect"], "site": row["site"]}
                )
            agg_means_df = pd.DataFrame(agg_means)
        chart = (
            alt.Chart(agg_means_df, title=f"{selection}")
            .mark_boxplot(color='darkgray',extent='min-max')
            .encode(
                x=alt.X(
                    "region:O",
                    sort=custom_order,
                    title="RBP Region",
                    axis=alt.Axis(labelAngle=-90),
                ),
                y=alt.Y(
                    "effect",
                    title=f"Cell Entry",
                    axis=alt.Axis(grid=True, tickCount=4),
                ),
                tooltip=["region", "effect", "site"],
            ).properties(width=config['width'],height=config['height'])
        )
        empty_charts.append(chart)
    combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
        y="shared", x="shared", color="independent"
    )
    return combined_effect_chart


entry_region_boxplot = make_boxplot_entry_region(concat_df)
entry_region_boxplot.display()
if entry_region_boxplot_plot is not None:
    entry_region_boxplot.save(entry_region_boxplot_plot)

Check for potential neutral/beneficial glycosylation sites¶

In [16]:
def find_potential_glycan_sites(df):
    filtered = df[df["wildtype"].isin(["T", "S"])]
    matching_sites = []
    for index, row in filtered.iterrows():
        # Check for existence of site two numbers prior with 'N' mutant and positive effect
        prior_rows = df[
            (df["site"] == row["site"] - 2) & (df["mutant"] == "N") & (df["effect"] > 0)
        ]
        if not prior_rows.empty:
            matching_sites.append(row["site"])
    unique_list1 = list(set(matching_sites))
    unique_list1 = [x - 2 for x in unique_list1]

    filtered = df[df["wildtype"].isin(["N"])]
    matching_sites = []
    for index, row in filtered.iterrows():
        # Check for existence of site two numbers prior with 'N' mutant and positive effect
        prior_rows = df[
            (df["site"] == row["site"] + 2)
            & (df["mutant"].isin(["T", "S"]))
            & (df["effect"] > 0)
        ]
        if not prior_rows.empty:
            matching_sites.append(row["site"])
    unique_list2 = list(set(matching_sites))
    unique_list = unique_list1 + unique_list2
    unique_list.sort()
    return unique_list
#call function
PNLG = find_potential_glycan_sites(func_scores_E3)
#read in surface exposure data to find potential glycans on surface of protein
surface_df = pd.read_csv(surface)
surface_df.rename(columns={"exposure_A": "Surface Exposure"}, inplace=True)
PNLG_SURFACE = surface_df[surface_df["site"].isin(PNLG)]
PNLG_SURFACE = list(
    PNLG_SURFACE[PNLG_SURFACE["Surface Exposure"] >= 25]["site"].unique()
)

#print(f"\nThe surface exposed PNLG sites are: {PNLG_SURFACE}\n")

glycans = config["glycans"] #these are the glycan sites that are already present in protein

#filter out known glycan sites already present
filtered_PNLG_SURFACE = [num for num in PNLG_SURFACE if num not in glycans]
print(f'These are {len(filtered_PNLG_SURFACE)} potential glycan sites one amino acid away: {filtered_PNLG_SURFACE}')
#print(len(filtered_PNLG_SURFACE))
These are 15 potential glycan sites one amino acid away: [180, 191, 192, 311, 326, 359, 383, 403, 423, 473, 478, 543, 554, 570, 600]

Make bar plot of average entry scores for CHO-bEFNB3¶

In [17]:
def entry_by_site(df):
    tmp_df = df.groupby('site')['effect'].mean().reset_index()
    
    # define ranges of different RBP regions
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 602)),
    }
    
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend
    
    agg_means = [] #store aggregation 
    
    # For each barrel, filter the dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = tmp_df[tmp_df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {"region": barrel, "effect": row["effect"], "site": row["site"]}
            )
        agg_means_df = pd.DataFrame(agg_means).round(3)
    #display(agg_means_df)
    agg_means_df['beta_sheet'] = agg_means_df['site'].isin(config['beta_sheet']) #add a column specifying which sites are in beta sheets
    
    ### The main chart plotting
    chart = (
        alt.Chart(agg_means_df)
        .mark_bar(opacity=1)
        .encode(
            x=alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90,values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
            y=alt.Y("effect", title="Mean entry in CHO-bEFNB3"),
            tooltip=["site", "effect","region"],
            color=alt.Color('region',sort=custom_order,title='Region'),
        )
    ).properties(width=1000,height=200)
    
    ### Draw rectanges showing where beta sheets are in protein above chart
    rect = alt.Chart(agg_means_df.query('beta_sheet == True')).mark_rect(color='black').encode(
        alt.X('site:N',axis=None),
        alt.Y('beta_sheet',axis=None)
    ).properties(width=800,height=10)
    
    combined_chart = alt.vconcat(chart,padding=0).resolve_scale(y='independent',x='shared')
    

    return combined_chart

entry_by_site_plot = entry_by_site(func_scores_E3)
entry_by_site_plot.display()

Make bar charts of mean cell entry by site for different regions, sorted by average and colored by the unmutated amino acid at position¶

In [18]:
def entry_by_site_region(df,site_list,name_of_title):
    #calculate mean by site
    tmp_df = df.groupby(['site','cell_type'])['effect'].mean().reset_index().round(3) 
    
    #make a unique values df to merge
    unique_wildtypes_df = df.drop_duplicates(subset=['site','wildtype','wt_type','wildtype_site'])
    
    #merge mean and unique values
    tmp_df = pd.merge(tmp_df, unique_wildtypes_df[['site', 'wt_type','wildtype','wildtype_site']], on='site', how='left')

    #filter by site
    tmp_df = tmp_df[tmp_df['site'].isin(site_list)]

    #make chart
    chart = (
        alt.Chart(tmp_df,title=name_of_title)
        .mark_bar(opacity=1)
        .encode(
            x=alt.X("site:N", sort='-y',title='Site',axis=alt.Axis(labelAngle=-90)),
            y=alt.Y("effect", title="Mean entry"),
            tooltip=["site", "effect", 'wildtype', 'wt_type'],
            color=alt.Color('wt_type:N', scale=alt.Scale(scheme='tableau10'), legend=alt.Legend(title="Unmutated amino-acid type"))
        )
    ).properties(width=alt.Step(12), height=100)
    
    return chart

Ranked average entry in contact sites¶

In [19]:
entry_by_site_receptor_plot = entry_by_site_region(func_scores_E3,config['contact_sites'],'Contact Sites')
entry_by_site_receptor_plot.display()

Now make ranked averages for each region¶

Ranked average entry in RBP stalk¶

In [20]:
entry_by_site_receptor_plot_stalk = entry_by_site_region(func_scores_E3,list(range(70, 148)),'Stalk')
entry_by_site_receptor_plot_stalk.display()

Ranked average entry in RBP neck¶

In [21]:
entry_by_site_receptor_plot_neck = entry_by_site_region(func_scores_E3,list(range(147,175)),'Neck')
entry_by_site_receptor_plot_neck.display()

Ranked average entry in RBP linker¶

In [22]:
entry_by_site_receptor_plot_linker = entry_by_site_region(func_scores_E3,list(range(166, 178)),'Linker')
entry_by_site_receptor_plot_linker.display()
In [23]:
combined_region_barplot = alt.vconcat(entry_by_site_receptor_plot_stalk,entry_by_site_receptor_plot_neck,entry_by_site_receptor_plot_linker,entry_by_site_receptor_plot,padding=0).resolve_scale(y="shared", x="independent", color="shared")
combined_region_barplot.display()
combined_region_barplot.save(combined_region_barplot_output)
In [ ]:
 
In [ ]: